PM 566 Assignment 1

Author

Sylwia Lipior

Read in the data

# Read the 2002 and 2022 CSV files into data.tables
Data_2002 <- data.table::fread(input = "ad_viz_plotval_data_2002.csv")
Data_2022 <- data.table::fread(input = "ad_viz_plotval_data_2022.csv")

summary(Data_2002)
     Date              Source             Site ID              POC       
 Length:15976       Length:15976       Min.   :60010007   Min.   :1.000  
 Class :character   Class :character   1st Qu.:60290014   1st Qu.:1.000  
 Mode  :character   Mode  :character   Median :60590007   Median :1.000  
                                       Mean   :60549600   Mean   :1.581  
                                       3rd Qu.:60731002   3rd Qu.:1.000  
                                       Max.   :61131003   Max.   :6.000  
                                                                         
 Daily Mean PM2.5 Concentration    UNITS           DAILY_AQI_VALUE 
 Min.   :  0.00                 Length:15976       Min.   :  0.00  
 1st Qu.:  7.00                 Class :character   1st Qu.: 29.00  
 Median : 12.00                 Mode  :character   Median : 50.00  
 Mean   : 16.12                                    Mean   : 53.68  
 3rd Qu.: 20.50                                    3rd Qu.: 69.00  
 Max.   :104.30                                    Max.   :176.00  
                                                                   
  Site Name         DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE
 Length:15976       Min.   :1       Min.   :100      Min.   :88101     
 Class :character   1st Qu.:1       1st Qu.:100      1st Qu.:88101     
 Mode  :character   Median :1       Median :100      Median :88101     
                    Mean   :1       Mean   :100      Mean   :88215     
                    3rd Qu.:1       3rd Qu.:100      3rd Qu.:88502     
                    Max.   :1       Max.   :100      Max.   :88502     
                                                                       
 AQS_PARAMETER_DESC   CBSA_CODE      CBSA_NAME           STATE_CODE
 Length:15976       Min.   :12540   Length:15976       Min.   :6   
 Class :character   1st Qu.:23420   Class :character   1st Qu.:6   
 Mode  :character   Median :40140   Mode  :character   Median :6   
                    Mean   :33270                      Mean   :6   
                    3rd Qu.:41740                      3rd Qu.:6   
                    Max.   :49700                      Max.   :6   
                    NA's   :929                                    
    STATE            COUNTY_CODE        COUNTY          SITE_LATITUDE  
 Length:15976       Min.   :  1.00   Length:15976       Min.   :32.63  
 Class :character   1st Qu.: 29.00   Class :character   1st Qu.:34.07  
 Mode  :character   Median : 59.00   Mode  :character   Median :35.36  
                    Mean   : 54.78                      Mean   :36.00  
                    3rd Qu.: 73.00                      3rd Qu.:37.77  
                    Max.   :113.00                      Max.   :41.71  
                                                                       
 SITE_LONGITUDE  
 Min.   :-124.2  
 1st Qu.:-121.4  
 Median :-119.1  
 Mean   :-119.4  
 3rd Qu.:-117.9  
 Max.   :-115.5  
                 
summary(Data_2022)
     Date              Source             Site ID              POC        
 Length:57775       Length:57775       Min.   :60010007   Min.   : 1.000  
 Class :character   Class :character   1st Qu.:60311004   1st Qu.: 1.000  
 Mode  :character   Mode  :character   Median :60631007   Median : 3.000  
                                       Mean   :60571692   Mean   : 2.531  
                                       3rd Qu.:60771003   3rd Qu.: 3.000  
                                       Max.   :61131003   Max.   :21.000  
                                                                          
 Daily Mean PM2.5 Concentration    UNITS           DAILY_AQI_VALUE 
 Min.   : -2.200                Length:57775       Min.   :  0.00  
 1st Qu.:  4.200                Class :character   1st Qu.: 18.00  
 Median :  7.000                Mode  :character   Median : 29.00  
 Mean   :  8.574                                   Mean   : 32.95  
 3rd Qu.: 10.900                                   3rd Qu.: 45.00  
 Max.   :302.500                                   Max.   :353.00  
                                                                   
  Site Name         DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE
 Length:57775       Min.   :1       Min.   :100      Min.   :88101     
 Class :character   1st Qu.:1       1st Qu.:100      1st Qu.:88101     
 Mode  :character   Median :1       Median :100      Median :88101     
                    Mean   :1       Mean   :100      Mean   :88196     
                    3rd Qu.:1       3rd Qu.:100      3rd Qu.:88101     
                    Max.   :1       Max.   :100      Max.   :88502     
                                                                       
 AQS_PARAMETER_DESC   CBSA_CODE      CBSA_NAME           STATE_CODE
 Length:57775       Min.   :12540   Length:57775       Min.   :6   
 Class :character   1st Qu.:31080   Class :character   1st Qu.:6   
 Mode  :character   Median :40140   Mode  :character   Median :6   
                    Mean   :35447                      Mean   :6   
                    3rd Qu.:41860                      3rd Qu.:6   
                    Max.   :49700                      Max.   :6   
                    NA's   :4761                                   
    STATE            COUNTY_CODE        COUNTY          SITE_LATITUDE  
 Length:57775       Min.   :  1.00   Length:57775       Min.   :32.58  
 Class :character   1st Qu.: 31.00   Class :character   1st Qu.:34.14  
 Mode  :character   Median : 63.00   Mode  :character   Median :36.60  
                    Mean   : 57.02                      Mean   :36.37  
                    3rd Qu.: 77.00                      3rd Qu.:38.10  
                    Max.   :113.00                      Max.   :41.76  
                                                                       
 SITE_LONGITUDE  
 Min.   :-124.2  
 1st Qu.:-121.5  
 Median :-119.8  
 Mean   :-119.7  
 3rd Qu.:-118.1  
 Max.   :-115.5  
                 
head(Data_2002)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 01/05/2002    AQS 60010007   1                           25.1 ug/m3 LC
2: 01/06/2002    AQS 60010007   1                           31.6 ug/m3 LC
3: 01/08/2002    AQS 60010007   1                           21.4 ug/m3 LC
4: 01/11/2002    AQS 60010007   1                           25.9 ug/m3 LC
5: 01/14/2002    AQS 60010007   1                           34.5 ug/m3 LC
6: 01/17/2002    AQS 60010007   1                           41.0 ug/m3 LC
   DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              78 Livermore               1              100
2:              92 Livermore               1              100
3:              71 Livermore               1              100
4:              80 Livermore               1              100
5:              98 Livermore               1              100
6:             115 Livermore               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     41860
2:              88101 PM2.5 - Local Conditions     41860
3:              88101 PM2.5 - Local Conditions     41860
4:              88101 PM2.5 - Local Conditions     41860
5:              88101 PM2.5 - Local Conditions     41860
6:              88101 PM2.5 - Local Conditions     41860
                           CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
   SITE_LATITUDE SITE_LONGITUDE
1:      37.68753      -121.7842
2:      37.68753      -121.7842
3:      37.68753      -121.7842
4:      37.68753      -121.7842
5:      37.68753      -121.7842
6:      37.68753      -121.7842
head(Data_2022)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 01/01/2022    AQS 60010007   3                           12.7 ug/m3 LC
2: 01/02/2022    AQS 60010007   3                           13.9 ug/m3 LC
3: 01/03/2022    AQS 60010007   3                            7.1 ug/m3 LC
4: 01/04/2022    AQS 60010007   3                            3.7 ug/m3 LC
5: 01/05/2022    AQS 60010007   3                            4.2 ug/m3 LC
6: 01/06/2022    AQS 60010007   3                            3.8 ug/m3 LC
   DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              52 Livermore               1              100
2:              55 Livermore               1              100
3:              30 Livermore               1              100
4:              15 Livermore               1              100
5:              18 Livermore               1              100
6:              16 Livermore               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     41860
2:              88101 PM2.5 - Local Conditions     41860
3:              88101 PM2.5 - Local Conditions     41860
4:              88101 PM2.5 - Local Conditions     41860
5:              88101 PM2.5 - Local Conditions     41860
6:              88101 PM2.5 - Local Conditions     41860
                           CBSA_NAME STATE_CODE      STATE COUNTY_CODE  COUNTY
1: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
2: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
3: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
4: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
5: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
6: San Francisco-Oakland-Hayward, CA          6 California           1 Alameda
   SITE_LATITUDE SITE_LONGITUDE
1:      37.68753      -121.7842
2:      37.68753      -121.7842
3:      37.68753      -121.7842
4:      37.68753      -121.7842
5:      37.68753      -121.7842
6:      37.68753      -121.7842
tail(Data_2002)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 12/10/2002    AQS 61131003   1                             15 ug/m3 LC
2: 12/13/2002    AQS 61131003   1                             15 ug/m3 LC
3: 12/22/2002    AQS 61131003   1                              1 ug/m3 LC
4: 12/25/2002    AQS 61131003   1                             23 ug/m3 LC
5: 12/28/2002    AQS 61131003   1                              5 ug/m3 LC
6: 12/31/2002    AQS 61131003   1                              6 ug/m3 LC
   DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              57 Woodland-Gibson Road               1              100
2:              57 Woodland-Gibson Road               1              100
3:               4 Woodland-Gibson Road               1              100
4:              74 Woodland-Gibson Road               1              100
5:              21 Woodland-Gibson Road               1              100
6:              25 Woodland-Gibson Road               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     40900
2:              88101 PM2.5 - Local Conditions     40900
3:              88101 PM2.5 - Local Conditions     40900
4:              88101 PM2.5 - Local Conditions     40900
5:              88101 PM2.5 - Local Conditions     40900
6:              88101 PM2.5 - Local Conditions     40900
                                 CBSA_NAME STATE_CODE      STATE COUNTY_CODE
1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
   COUNTY SITE_LATITUDE SITE_LONGITUDE
1:   Yolo      38.66121      -121.7327
2:   Yolo      38.66121      -121.7327
3:   Yolo      38.66121      -121.7327
4:   Yolo      38.66121      -121.7327
5:   Yolo      38.66121      -121.7327
6:   Yolo      38.66121      -121.7327
# Inspect the last rows of the 2022 data (the 2002 tail is already shown above);
# re-render the document to refresh the printed output for this call.
tail(Data_2022)
         Date Source  Site ID POC Daily Mean PM2.5 Concentration    UNITS
1: 12/10/2002    AQS 61131003   1                             15 ug/m3 LC
2: 12/13/2002    AQS 61131003   1                             15 ug/m3 LC
3: 12/22/2002    AQS 61131003   1                              1 ug/m3 LC
4: 12/25/2002    AQS 61131003   1                             23 ug/m3 LC
5: 12/28/2002    AQS 61131003   1                              5 ug/m3 LC
6: 12/31/2002    AQS 61131003   1                              6 ug/m3 LC
   DAILY_AQI_VALUE            Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
1:              57 Woodland-Gibson Road               1              100
2:              57 Woodland-Gibson Road               1              100
3:               4 Woodland-Gibson Road               1              100
4:              74 Woodland-Gibson Road               1              100
5:              21 Woodland-Gibson Road               1              100
6:              25 Woodland-Gibson Road               1              100
   AQS_PARAMETER_CODE       AQS_PARAMETER_DESC CBSA_CODE
1:              88101 PM2.5 - Local Conditions     40900
2:              88101 PM2.5 - Local Conditions     40900
3:              88101 PM2.5 - Local Conditions     40900
4:              88101 PM2.5 - Local Conditions     40900
5:              88101 PM2.5 - Local Conditions     40900
6:              88101 PM2.5 - Local Conditions     40900
                                 CBSA_NAME STATE_CODE      STATE COUNTY_CODE
1: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
2: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
3: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
4: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
5: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
6: Sacramento--Roseville--Arden-Arcade, CA          6 California         113
   COUNTY SITE_LATITUDE SITE_LONGITUDE
1:   Yolo      38.66121      -121.7327
2:   Yolo      38.66121      -121.7327
3:   Yolo      38.66121      -121.7327
4:   Yolo      38.66121      -121.7327
5:   Yolo      38.66121      -121.7327
6:   Yolo      38.66121      -121.7327
dim(Data_2002)
[1] 15976    20
dim(Data_2022)
[1] 57775    20
str(Data_2002)
Classes 'data.table' and 'data.frame':  15976 obs. of  20 variables:
 $ Date                          : chr  "01/05/2002" "01/06/2002" "01/08/2002" "01/11/2002" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Daily Mean PM2.5 Concentration: num  25.1 31.6 21.4 25.9 34.5 41 29.3 15 18.8 37.9 ...
 $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ DAILY_AQI_VALUE               : int  78 92 71 80 98 115 87 57 65 107 ...
 $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
 $ STATE                         : chr  "California" "California" "California" "California" ...
 $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
 - attr(*, ".internal.selfref")=<externalptr> 
str(Data_2022)
Classes 'data.table' and 'data.frame':  57775 obs. of  20 variables:
 $ Date                          : chr  "01/01/2022" "01/02/2022" "01/03/2022" "01/04/2022" ...
 $ Source                        : chr  "AQS" "AQS" "AQS" "AQS" ...
 $ Site ID                       : int  60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
 $ POC                           : int  3 3 3 3 3 3 3 3 3 3 ...
 $ Daily Mean PM2.5 Concentration: num  12.7 13.9 7.1 3.7 4.2 3.8 2.3 6.9 13.6 11.2 ...
 $ UNITS                         : chr  "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
 $ DAILY_AQI_VALUE               : int  52 55 30 15 18 16 10 29 54 47 ...
 $ Site Name                     : chr  "Livermore" "Livermore" "Livermore" "Livermore" ...
 $ DAILY_OBS_COUNT               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ PERCENT_COMPLETE              : num  100 100 100 100 100 100 100 100 100 100 ...
 $ AQS_PARAMETER_CODE            : int  88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
 $ AQS_PARAMETER_DESC            : chr  "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
 $ CBSA_CODE                     : int  41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
 $ CBSA_NAME                     : chr  "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
 $ STATE_CODE                    : int  6 6 6 6 6 6 6 6 6 6 ...
 $ STATE                         : chr  "California" "California" "California" "California" ...
 $ COUNTY_CODE                   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ COUNTY                        : chr  "Alameda" "Alameda" "Alameda" "Alameda" ...
 $ SITE_LATITUDE                 : num  37.7 37.7 37.7 37.7 37.7 ...
 $ SITE_LONGITUDE                : num  -122 -122 -122 -122 -122 ...
 - attr(*, ".internal.selfref")=<externalptr> 
# get variable names
names(Data_2002)
 [1] "Date"                           "Source"                        
 [3] "Site ID"                        "POC"                           
 [5] "Daily Mean PM2.5 Concentration" "UNITS"                         
 [7] "DAILY_AQI_VALUE"                "Site Name"                     
 [9] "DAILY_OBS_COUNT"                "PERCENT_COMPLETE"              
[11] "AQS_PARAMETER_CODE"             "AQS_PARAMETER_DESC"            
[13] "CBSA_CODE"                      "CBSA_NAME"                     
[15] "STATE_CODE"                     "STATE"                         
[17] "COUNTY_CODE"                    "COUNTY"                        
[19] "SITE_LATITUDE"                  "SITE_LONGITUDE"                
names(Data_2022)
 [1] "Date"                           "Source"                        
 [3] "Site ID"                        "POC"                           
 [5] "Daily Mean PM2.5 Concentration" "UNITS"                         
 [7] "DAILY_AQI_VALUE"                "Site Name"                     
 [9] "DAILY_OBS_COUNT"                "PERCENT_COMPLETE"              
[11] "AQS_PARAMETER_CODE"             "AQS_PARAMETER_DESC"            
[13] "CBSA_CODE"                      "CBSA_NAME"                     
[15] "STATE_CODE"                     "STATE"                         
[17] "COUNTY_CODE"                    "COUNTY"                        
[19] "SITE_LATITUDE"                  "SITE_LONGITUDE"                

Both data sets have the same 20 variables. Data_2002 has 15976 observations, while Data_2022 has 57775 observations (about 3.6 times as many).

# Tag each data set with its year, then vertically concatenate them

Data_2002$Year <- 2002
Data_2022$Year <- 2022

all_data <- rbind(Data_2002, Data_2022)

# Shorten the column names used in the rest of the analysis:
#   Daily Mean PM2.5 Concentration -> PM2.5
#   SITE_LATITUDE / SITE_LONGITUDE -> lat / lon
#   Site Name                      -> Site_Name
data.table::setnames(
  all_data,
  old = c(
    "Daily Mean PM2.5 Concentration",
    "SITE_LATITUDE",
    "SITE_LONGITUDE",
    "Site Name"
  ),
  new = c("PM2.5", "lat", "lon", "Site_Name"),
  skip_absent = TRUE  # a missing column is silently ignored
)

Leaflet map

library(leaflet)
library(leaflet.extras)

# Keep one row per distinct location and year. The map shows where the sites
# are, so drawing a marker for every daily observation only adds overplotting
# and rendering cost.
site_locations <- unique(all_data[, c("lat", "lon", "Year")])

# Year is a two-level category (2002 vs 2022), so use a factor palette:
# the first level (2002) maps to blue and the second (2022) to red.
color_pal <- colorFactor(
  palette = c("blue", "red"),
  domain = factor(site_locations$Year)
)

# Create a leaflet map with one circle marker per site location
map <- leaflet(data = site_locations) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lat = ~lat,
    lng = ~lon,
    radius = 3,
    fillOpacity = 0.8,
    color = ~color_pal(factor(Year))  # Colour each marker by its year
  ) %>%
  addLegend(
    title = "Site Location by Year",
    colors = c("blue", "red"),  # Same colours as color_pal
    labels = c("2002", "2022"),
    opacity = 1,  # Legend opacity must lie between 0 and 1
    position = "bottomleft"
  )

# Show the map
map

There are many more locations in 2022 than in 2002. There are many more sites around/near the large coastal cities of San Diego, Los Angeles, and San Francisco.

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
# Parse the month/day/year strings into Date objects. The guard keeps this
# safe to re-run: calling mdy() on an already parsed column would not
# round-trip.
if (is.character(all_data$Date)) {
  all_data$Date <- mdy(all_data$Date)
}

# Create a histogram of PM2.5 values
ggplot(all_data, aes(x = PM2.5)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black") +
  labs(title = "Distribution of PM2.5 Values",
       x = "PM2.5 Value", y = "Frequency") +
  theme_minimal()

# Count implausible PM2.5 values (negative or above 500) for each date.
# Year is kept as a grouping column so each year can be drawn in its own panel;
# na.rm = TRUE keeps a missing concentration from turning a whole day into NA.
summary_data <- all_data %>%
  mutate(Implausible = PM2.5 < 0 | PM2.5 > 500) %>%
  group_by(Year, Date) %>%
  summarise(
    Count_Implausible = sum(Implausible, na.rm = TRUE),
    .groups = "drop"
  )

# Create a line plot of the counts over time. Faceting by year avoids drawing
# a line across the 20-year gap between the two data sets.
ggplot(summary_data, aes(x = Date, y = Count_Implausible)) +
  geom_line(aes(color = "Implausible")) +
  facet_wrap(~ Year, scales = "free_x") +
  labs(title = "Implausible PM2.5 Values Over Time",
       x = "Date", y = "Count") +
  scale_color_manual(values = c("Implausible" = "blue")) +
  theme_minimal()

There are 143 implausible data points; they occur largely at the beginning and at the end of the data set. No observation exceeds the upper cutoff of 500 (the largest value in either year is 302.5), so the implausible points are the negative concentrations, all of which occur in 2022 (the 2002 minimum is 0). This could possibly be explained by more testing sites, more measurements being taken, or issues with instrumentation.

# Load necessary libraries
library(ggplot2)
library(dplyr)

# State level: mean and standard deviation of PM2.5 for each year
state_summary <- summarise(
  group_by(all_data, Year),
  Mean_PM2.5 = mean(PM2.5, na.rm = TRUE),
  SD_PM2.5 = sd(PM2.5, na.rm = TRUE)
)

# Bar plot of the yearly state-wide mean
ggplot(state_summary, aes(x = as.factor(Year), y = Mean_PM2.5)) +
  geom_col(fill = "skyblue") +
  ggtitle("Average PM2.5 Levels by Year in California") +
  xlab("Year") +
  ylab("Mean PM2.5") +
  theme_minimal()

# Histogram of all PM2.5 observations (both years combined)
ggplot(all_data, aes(x = PM2.5)) +
  geom_histogram(binwidth = 2, fill = "lightgreen") +
  ggtitle("PM2.5 Distribution in California") +
  xlab("PM2.5 Levels") +
  theme_minimal()

# County level: mean, standard deviation and median of PM2.5 per county
county_summary <- summarise(
  group_by(all_data, COUNTY),
  Mean_PM2.5 = mean(PM2.5, na.rm = TRUE),
  SD_PM2.5 = sd(PM2.5, na.rm = TRUE),
  Median_PM2.5 = median(PM2.5, na.rm = TRUE)
)

# One boxplot per county; the x-axis labels are tilted so the names stay legible
ggplot(all_data, aes(x = COUNTY, y = PM2.5)) +
  geom_boxplot(fill = "lightblue") +
  ggtitle("PM2.5 Distribution by County") +
  xlab("County") +
  ylab("PM2.5 Levels") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Los Angeles County analysis: keep only observations from LA County
la_county_data <- all_data %>%
  filter(COUNTY == "Los Angeles")

# Summary statistic: mean and SD of PM2.5 per year, pooled over all LA County
# observations
la_summary <- la_county_data %>%
  group_by(Year) %>%
  summarise(
    Mean_PM2.5 = mean(PM2.5, na.rm = TRUE),
    SD_PM2.5 = sd(PM2.5, na.rm = TRUE)
  )

# Define custom colors for LA County and State
la_color <- "red"
state_color <- "blue"

# Stack the two summaries into one table with a Location label. Dodging only
# separates bars that belong to the same layer, so both series must come from
# a single data frame to be drawn side by side instead of on top of each other.
location_summary <- bind_rows(
  mutate(la_summary, Location = "LA County"),
  mutate(state_summary, Location = "State")
)

# Plot: side-by-side bars of average PM2.5 by year for LA County and the state
ggplot(
  location_summary,
  aes(x = as.factor(Year), y = Mean_PM2.5, fill = Location)
) +
  geom_col(position = "dodge") +
  labs(title = "Average PM2.5 Levels in LA County and California", x = "Year", y = "Mean PM2.5") +
  scale_fill_manual(values = c("LA County" = la_color, "State" = state_color)) +
  theme_minimal() +
  guides(fill = guide_legend(title = "Location"))

At the state level, we look at the histogram of PM 2.5 levels in California, and find that the data is distributed with the most counts around 10.

At the county level, we look at the PM 2.5 distribution by county. We find that counties like Placer, Nevada, Trinity, and Siskiyou have some of the highest PM 2.5 levels in the state.

At the LA County level, we look at the average PM 2.5 level in 2002 and 2022 and find that LA County is higher than the state average in both years, but both the state and the county averages are lower in 2022.